feat(awooop): surface ai route repair evidence
This commit is contained in:
@@ -151,6 +151,7 @@ class AiRouteStatusResponse(BaseModel):
|
||||
active_lane: dict[str, Any] | None = None
|
||||
skipped_lanes: list[dict[str, Any]] = Field(default_factory=list)
|
||||
operator_action: dict[str, Any] | None = None
|
||||
repair_evidence: dict[str, Any] | None = None
|
||||
checked_at: datetime
|
||||
|
||||
|
||||
|
||||
@@ -99,6 +99,8 @@ _CICD_STATUS_FILTERS = {"running", "success", "failed", "pending"}
|
||||
_CICD_STAGE_RE = re.compile(r"^[a-z0-9_:-]{1,64}$", re.IGNORECASE)
|
||||
_AI_ROUTE_STATUS_SCHEMA_VERSION = "awooop_ai_route_status_v1"
|
||||
_AI_ROUTE_WORKLOADS = set(get_args(OllamaWorkloadType))
|
||||
_AI_ROUTE_REPAIR_EVIDENCE_PROVIDER = "ai_route_repair"
|
||||
_AI_ROUTE_REPAIR_EVIDENCE_STAGE = "repair_diagnosis"
|
||||
_SOURCE_CORRELATION_SCHEMA_VERSION = "source_provider_correlation_v1"
|
||||
_SOURCE_CORRELATION_PROVIDERS = ("sentry", "signoz")
|
||||
_SOURCE_CORRELATION_EVENT_LIMIT = 200
|
||||
@@ -642,7 +644,7 @@ async def get_ai_route_status(
|
||||
selected_provider=route.primary.provider_name,
|
||||
health=health,
|
||||
))
|
||||
return response
|
||||
return await _ai_route_response_with_repair_evidence(response)
|
||||
|
||||
|
||||
def _validate_ai_route_workload(workload_type: str | None) -> OllamaWorkloadType:
|
||||
@@ -696,7 +698,7 @@ async def _ai_route_lightweight_status_from_policy(
|
||||
route_reason=route_reason,
|
||||
error=str(exc),
|
||||
)
|
||||
return _ai_route_unavailable_status(
|
||||
response = _ai_route_unavailable_status(
|
||||
workload=workload,
|
||||
policy_order=policy_order,
|
||||
checked_at=checked_at,
|
||||
@@ -704,6 +706,7 @@ async def _ai_route_lightweight_status_from_policy(
|
||||
route_error=route_error,
|
||||
route_source="ollama_failover_manager",
|
||||
)
|
||||
return await _ai_route_response_with_repair_evidence(response)
|
||||
|
||||
health_by_provider = {
|
||||
endpoint.provider_name: _ai_route_health_item(report)
|
||||
@@ -741,7 +744,7 @@ async def _ai_route_lightweight_status_from_policy(
|
||||
selected_provider="gemini",
|
||||
health=health_by_provider,
|
||||
))
|
||||
return response
|
||||
return await _ai_route_response_with_repair_evidence(response)
|
||||
|
||||
selected = endpoints[selected_index]
|
||||
model = get_settings().OLLAMA_HEALTH_CHECK_MODEL
|
||||
@@ -783,7 +786,7 @@ async def _ai_route_lightweight_status_from_policy(
|
||||
selected_provider=selected.provider_name,
|
||||
health=health_by_provider,
|
||||
))
|
||||
return response
|
||||
return await _ai_route_response_with_repair_evidence(response)
|
||||
|
||||
|
||||
async def _ai_route_probe_connectivity(
|
||||
@@ -873,6 +876,208 @@ def _ai_route_unavailable_status(
|
||||
return response
|
||||
|
||||
|
||||
async def _ai_route_response_with_repair_evidence(
|
||||
response: dict[str, Any],
|
||||
) -> dict[str, Any]:
|
||||
"""Attach latest read-only repair dossier evidence when a lane is degraded."""
|
||||
response["repair_evidence"] = None
|
||||
if response.get("lane_mode") not in {
|
||||
"degraded_failover",
|
||||
"cloud_fallback",
|
||||
"unavailable",
|
||||
}:
|
||||
return response
|
||||
|
||||
target_provider = _ai_route_repair_evidence_target(response)
|
||||
response["repair_evidence"] = await _latest_ai_route_repair_evidence(
|
||||
target_provider=target_provider,
|
||||
)
|
||||
return response
|
||||
|
||||
|
||||
def _ai_route_repair_evidence_target(response: Mapping[str, Any]) -> str | None:
|
||||
skipped_lanes = response.get("skipped_lanes")
|
||||
if isinstance(skipped_lanes, list):
|
||||
for lane in skipped_lanes:
|
||||
if not isinstance(lane, dict):
|
||||
continue
|
||||
provider_name = str(lane.get("provider_name") or "").strip()
|
||||
if provider_name and lane.get("action_required") is True:
|
||||
return provider_name
|
||||
for lane in skipped_lanes:
|
||||
if isinstance(lane, dict):
|
||||
provider_name = str(lane.get("provider_name") or "").strip()
|
||||
if provider_name:
|
||||
return provider_name
|
||||
policy_order = response.get("policy_order")
|
||||
if isinstance(policy_order, list):
|
||||
for item in policy_order:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
if item.get("runtime") == "ollama":
|
||||
provider_name = str(item.get("provider_name") or "").strip()
|
||||
if provider_name:
|
||||
return provider_name
|
||||
return None
|
||||
|
||||
|
||||
async def _latest_ai_route_repair_evidence(
    *,
    project_id: str = "awoooi",
    target_provider: str | None = None,
) -> dict[str, Any] | None:
    """Fetch the newest AI route repair diagnosis stored in the AwoooP event DB.

    When ``target_provider`` is given, evidence whose target resource equals
    it is looked up first. If that finds nothing, the lookup is repeated
    without the target filter, so the result may describe a different
    target. Callers should read ``target_resource`` from the returned item.

    This is best-effort: any exception raised by the lookup is logged as a
    warning and ``None`` is returned instead of propagating.
    """
    base_params: dict[str, Any] = {
        "project_id": project_id,
        "provider": _AI_ROUTE_REPAIR_EVIDENCE_PROVIDER,
        "stage": _AI_ROUTE_REPAIR_EVIDENCE_STAGE,
    }
    # Static SQL fragment; the provider value itself travels as a bound param.
    target_filter = """
        AND COALESCE(
            NULLIF(source_envelope #>> '{log_correlation,target_resource}', ''),
            NULLIF(source_envelope #>> '{extra,payload,target_resource}', '')
        ) = :target_provider
    """

    try:
        if target_provider:
            targeted = await _fetch_latest_ai_route_repair_evidence(
                params={**base_params, "target_provider": target_provider},
                target_clause=target_filter,
            )
            if targeted is not None:
                return targeted
        # Either no target was requested or nothing matched it: take the
        # newest diagnosis regardless of target.
        return await _fetch_latest_ai_route_repair_evidence(
            params=base_params,
            target_clause="",
        )
    except Exception as exc:
        logger.warning(
            "ai_route_repair_evidence_fetch_failed",
            project_id=project_id,
            target_provider=target_provider,
            error=str(exc),
        )
        return None
|
||||
|
||||
|
||||
async def _fetch_latest_ai_route_repair_evidence(
    *,
    params: dict[str, Any],
    target_clause: str,
) -> dict[str, Any] | None:
    """Run the "latest repair evidence" query and project the row, if any.

    Selects at most one row from ``awooop_conversation_event`` matching the
    bound ``project_id``, ``provider`` and ``stage`` values, newest first by
    ``received_at`` then ``event_id``. Returns the projected evidence dict,
    or ``None`` when no row matches.

    ``target_clause`` is inserted verbatim into the statement text, so it
    must be a trusted, code-defined SQL fragment (or ``""``). Variable
    values belong in ``params`` as bound parameters.
    """
    statement = text(f"""
        SELECT
            event_id,
            run_id,
            provider_event_id,
            source_envelope,
            provider_ts,
            received_at
        FROM awooop_conversation_event
        WHERE project_id = :project_id
          AND LOWER(COALESCE(
                NULLIF(source_envelope->>'provider', ''),
                NULLIF(split_part(provider_event_id, ':', 1), ''),
                channel_type
              )) = :provider
          AND LOWER(COALESCE(NULLIF(source_envelope->>'stage', ''), 'received')) = :stage
          {target_clause}
        ORDER BY received_at DESC, event_id DESC
        LIMIT 1
    """)
    async with get_db_context("awoooi") as session:
        outcome = await session.execute(statement, params)
        latest = outcome.mappings().first()
    if not latest:
        return None
    return _ai_route_repair_evidence_item(latest)
|
||||
|
||||
|
||||
def _ai_route_repair_evidence_item(
    row: Mapping[str, Any],
) -> dict[str, Any]:
    """Project a route-repair event row into a compact operator-safe view.

    Reads ``source_envelope`` (plus its ``extra.payload`` and
    ``log_correlation`` sub-objects) from ``row`` and returns a flat dict of
    selected fields only. Missing or non-dict sections are handled by
    ``_as_dict``; the resulting lookups simply yield ``None``.
    """
    envelope = _as_dict(row.get("source_envelope"))
    payload = _as_dict(_as_dict(envelope.get("extra")).get("payload"))
    correlation = _as_dict(envelope.get("log_correlation"))
    observed = _as_dict(payload.get("observed_state"))
    probe = _as_dict(payload.get("live_probe"))
    provider_event_id = row.get("provider_event_id")

    # Payload schema wins, then the envelope's, then this projection's own tag.
    schema_version = (
        payload.get("schema_version")
        or envelope.get("schema_version")
        or "ai_route_repair_evidence_projection_v1"
    )
    # Without an explicit provider, fall back to the "<provider>:..." prefix
    # of the provider event id.
    provider = envelope.get("provider") or str(provider_event_id or "").split(":", 1)[0]
    target_resource = (
        correlation.get("target_resource")
        or payload.get("target_resource")
        or observed.get("target_resource")
    )

    item: dict[str, Any] = {}
    item["schema_version"] = schema_version
    item["provider"] = provider
    item["stage"] = envelope.get("stage") or _AI_ROUTE_REPAIR_EVIDENCE_STAGE
    item["provider_event_id"] = provider_event_id
    item["conversation_event_id"] = _string_or_none(row.get("event_id"))
    item["run_id"] = _string_or_none(row.get("run_id"))
    item["alertname"] = correlation.get("alertname")
    item["severity"] = correlation.get("severity")
    item["fingerprint"] = correlation.get("fingerprint")
    item["target_resource"] = target_resource
    item["observed_state"] = observed
    item["live_probe"] = probe
    item["access_blockers"] = _as_string_list(payload.get("access_blockers"))
    item["side_effects"] = _ai_route_repair_side_effects(payload.get("side_effects"))
    item["source_ref_count"] = _source_ref_count(envelope)
    item["provider_ts"] = row.get("provider_ts")
    item["received_at"] = row.get("received_at")
    return item
|
||||
|
||||
|
||||
def _ai_route_repair_side_effects(value: Any) -> dict[str, bool | None]:
    """Reduce a side-effects mapping to four known flags.

    Each flag is kept only when it is a real ``bool``. Anything else,
    including a missing key, is reported as ``None``.
    """
    reported = _as_dict(value)
    tracked_flags = (
        "incident_created",
        "telegram_sent",
        "approval_created",
        "runtime_route_changed",
    )
    return {flag: _bool_or_none(reported.get(flag)) for flag in tracked_flags}
|
||||
|
||||
|
||||
def _as_string_list(value: Any) -> list[str]:
|
||||
if isinstance(value, list):
|
||||
return [str(item) for item in value if str(item or "").strip()]
|
||||
if value not in (None, ""):
|
||||
return [str(value)]
|
||||
return []
|
||||
|
||||
|
||||
def _string_or_none(value: Any) -> str | None:
|
||||
if value in (None, ""):
|
||||
return None
|
||||
return str(value)
|
||||
|
||||
|
||||
def _bool_or_none(value: Any) -> bool | None:
|
||||
return value if isinstance(value, bool) else None
|
||||
|
||||
|
||||
def _source_ref_count(envelope: Any) -> int:
    """Count the references listed under ``envelope["source_refs"]``.

    A list value contributes one per item whose ``str(item or "")`` is
    non-blank. Any other value contributes one unless it is ``None`` or
    ``""``.
    """
    refs = _as_dict(_as_dict(envelope).get("source_refs"))

    def _weight(entry: Any) -> int:
        if isinstance(entry, list):
            return sum(1 for member in entry if str(member or "").strip())
        return 0 if entry in (None, "") else 1

    return sum(_weight(entry) for entry in refs.values())
|
||||
|
||||
|
||||
def _ai_route_lane_state(
|
||||
*,
|
||||
policy_order: list[dict[str, Any]],
|
||||
|
||||
@@ -21,6 +21,7 @@ from src.services.platform_operator_service import (
|
||||
_ai_route_health_map,
|
||||
_ai_route_lane_state,
|
||||
_ai_route_policy_order,
|
||||
_ai_route_repair_evidence_item,
|
||||
_build_awooop_status_chain,
|
||||
_callback_reply_event_item,
|
||||
_callback_reply_summary_matches_status,
|
||||
@@ -1562,6 +1563,12 @@ def test_ai_route_status_response_preserves_route_fields() -> None:
|
||||
"action": "monitor",
|
||||
"reason": "primary_lane_active",
|
||||
},
|
||||
"repair_evidence": {
|
||||
"provider": "ai_route_repair",
|
||||
"stage": "repair_diagnosis",
|
||||
"target_resource": "ollama_gcp_a",
|
||||
"access_blockers": ["gcloud_compute_instances_get_missing"],
|
||||
},
|
||||
"checked_at": datetime(2026, 5, 19, 12, 0, 0),
|
||||
})
|
||||
|
||||
@@ -1569,6 +1576,79 @@ def test_ai_route_status_response_preserves_route_fields() -> None:
|
||||
assert dumped["policy_order"][-1]["provider_name"] == "gemini"
|
||||
assert dumped["selected_provider"] == "ollama_gcp_a"
|
||||
assert dumped["lane_mode"] == "primary"
|
||||
assert dumped["repair_evidence"]["target_resource"] == "ollama_gcp_a"
|
||||
|
||||
|
||||
def test_ai_route_repair_evidence_item_summarizes_operator_safe_fields() -> None:
    """A stored repair-diagnosis row projects to the compact evidence view."""
    run_id = "ca67ebcc-a24f-53e7-9505-2db15d855ecc"
    blockers = [
        "gcloud_compute_instances_get_missing",
        "gcp_a_ollama_11434_refused",
    ]
    no_side_effects = {
        "incident_created": False,
        "telegram_sent": False,
        "approval_created": False,
        "runtime_route_changed": False,
    }
    payload = {
        "schema_version": "ai_route_repair_diagnosis_v1",
        "observed_state": {
            "target_resource": "ollama_gcp_a",
            "lane_mode": "degraded_failover",
        },
        "live_probe": {
            "gcp_a_direct_11434": "connection_refused",
            "gcp_b_direct_11434": "http_200",
        },
        "access_blockers": list(blockers),
        "side_effects": dict(no_side_effects),
    }
    envelope = {
        "provider": "ai_route_repair",
        "stage": "repair_diagnosis",
        "log_correlation": {
            "alertname": "GcpAPrimaryLaneDown",
            "severity": "warning",
            "target_resource": "ollama_gcp_a",
            "fingerprint": "ai-route-gcp-a-primary-down",
        },
        # Four non-empty refs in total; the empty Sentry list adds nothing.
        "source_refs": {
            "alert_ids": ["gcp-a-primary-lane-down"],
            "signoz_alerts": ["signoz:gcp-a"],
            "sentry_issue_ids": [],
            "fingerprints": ["ai-route-gcp-a-primary-down"],
            "run_ids": [run_id],
        },
        "extra": {"payload": payload},
    }
    row = {
        "event_id": UUID("dff309f0-f159-4537-8f58-47714ce94dca"),
        "run_id": UUID(run_id),
        "provider_event_id": (
            "ai_route_repair:repair_diagnosis:"
            "gcp-a-primary-lane-down-20260525T060415Z"
        ),
        "provider_ts": datetime(2026, 5, 25, 6, 4, 15),
        "received_at": datetime(2026, 5, 25, 6, 5, 3),
        "source_envelope": envelope,
    }

    item = _ai_route_repair_evidence_item(row)

    assert item["provider"] == "ai_route_repair"
    assert item["stage"] == "repair_diagnosis"
    assert item["target_resource"] == "ollama_gcp_a"
    assert item["run_id"] == run_id
    assert item["source_ref_count"] == 4
    assert item["access_blockers"] == blockers
    assert item["live_probe"]["gcp_a_direct_11434"] == "connection_refused"
    assert item["side_effects"] == no_side_effects
|
||||
|
||||
|
||||
def test_ai_route_lane_state_marks_degraded_failover() -> None:
|
||||
@@ -1669,6 +1749,15 @@ async def test_ai_route_status_times_out_before_slow_provider_checks(monkeypatch
|
||||
fake_connectivity,
|
||||
)
|
||||
|
||||
async def no_repair_evidence(**_kwargs):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(
|
||||
platform_operator_service,
|
||||
"_latest_ai_route_repair_evidence",
|
||||
no_repair_evidence,
|
||||
)
|
||||
|
||||
response = await platform_operator_service.get_ai_route_status("deep_rca")
|
||||
|
||||
assert response["route_reason"] == (
|
||||
@@ -1722,6 +1811,15 @@ async def test_ai_route_status_lightweight_fallback_keeps_gemini_policy_only(
|
||||
fake_offline_connectivity,
|
||||
)
|
||||
|
||||
async def no_repair_evidence(**_kwargs):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(
|
||||
platform_operator_service,
|
||||
"_latest_ai_route_repair_evidence",
|
||||
no_repair_evidence,
|
||||
)
|
||||
|
||||
response = await platform_operator_service.get_ai_route_status("deep_rca")
|
||||
|
||||
assert response["selected_provider"] == "gemini"
|
||||
|
||||
@@ -261,6 +261,7 @@
|
||||
"modelRoute": "Model route",
|
||||
"routeDetail": "{model}; current {selected}; {primary}={primaryStatus}; fallback {fallback}",
|
||||
"routeLaneDetail": "{mode}; skipped {skipped}",
|
||||
"routeRepairDetail": "Repair evidence: {target}, blockers {blockers}, {sourceRefs} source refs",
|
||||
"routeReasonSeparator": "; ",
|
||||
"routeReason": "Reason: {reason}",
|
||||
"routeErrorDetail": "Route check failed: {error}",
|
||||
@@ -2924,7 +2925,49 @@
|
||||
"inspect_ai_router": "Inspect AI Router / provider status",
|
||||
"unknown": "Confirm next action"
|
||||
},
|
||||
"degradedSummary": "Current handoff is {active}; skipped {skipped}; next action: {action}"
|
||||
"degradedSummary": "Current handoff is {active}; skipped {skipped}; next action: {action}",
|
||||
"repairEvidence": {
|
||||
"title": "Latest repair diagnosis evidence",
|
||||
"meta": "Event info",
|
||||
"target": "Target: {target}",
|
||||
"run": "Run: {run}",
|
||||
"receivedAt": "Stored: {time}",
|
||||
"sourceRefs": "{count} source refs",
|
||||
"blockerTitle": "Current blockers",
|
||||
"sideEffectTitle": "Side-effect check",
|
||||
"sideEffectSeparator": ": ",
|
||||
"emptyValue": "--",
|
||||
"values": {
|
||||
"yes": "yes",
|
||||
"no": "no",
|
||||
"unknown": "not reported"
|
||||
},
|
||||
"sideEffects": {
|
||||
"incident_created": "Incident created",
|
||||
"telegram_sent": "Telegram sent",
|
||||
"approval_created": "Approval created",
|
||||
"runtime_route_changed": "Runtime route changed"
|
||||
},
|
||||
"blockers": {
|
||||
"gcloud_compute_instances_get_missing": "Missing GCP instance get permission",
|
||||
"gcloud_compute_instances_list_missing": "Missing GCP instance list permission",
|
||||
"gcloud_projects_get_iam_policy_missing": "Missing GCP IAM read permission",
|
||||
"gcp_a_ssh_refused": "GCP-A SSH refused",
|
||||
"gcp_a_ollama_11434_refused": "GCP-A Ollama 11434 refused",
|
||||
"proxy_110_11435_http_502": "110 proxy 11435 returned 502",
|
||||
"unknown": "{blocker}"
|
||||
},
|
||||
"probes": {
|
||||
"gcp_a_ping": "GCP-A ping",
|
||||
"gcp_a_ssh_22": "GCP-A SSH 22",
|
||||
"gcp_a_direct_11434": "GCP-A 11434",
|
||||
"gcp_b_direct_11434": "GCP-B 11434",
|
||||
"proxy_110_11435": "110 proxy 11435",
|
||||
"proxy_110_11436": "110 proxy 11436",
|
||||
"proxy_110_11437": "110 proxy 11437",
|
||||
"unknown": "{probe}"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"incidentEvidence": {
|
||||
|
||||
@@ -262,6 +262,7 @@
|
||||
"modelRoute": "模型路由",
|
||||
"routeDetail": "{model};目前 {selected};{primary}={primaryStatus};備援 {fallback}",
|
||||
"routeLaneDetail": "{mode};已跳過 {skipped}",
|
||||
"routeRepairDetail": "修復證據:{target},阻塞 {blockers},來源 {sourceRefs} 筆",
|
||||
"routeReasonSeparator": ";",
|
||||
"routeReason": "原因:{reason}",
|
||||
"routeErrorDetail": "路由檢查失敗:{error}",
|
||||
@@ -2925,7 +2926,49 @@
|
||||
"inspect_ai_router": "需檢查 AI Router / provider 狀態",
|
||||
"unknown": "待確認下一步"
|
||||
},
|
||||
"degradedSummary": "目前由 {active} 接手;已跳過 {skipped};下一步:{action}"
|
||||
"degradedSummary": "目前由 {active} 接手;已跳過 {skipped};下一步:{action}",
|
||||
"repairEvidence": {
|
||||
"title": "最新修復診斷證據",
|
||||
"meta": "事件資訊",
|
||||
"target": "目標:{target}",
|
||||
"run": "Run:{run}",
|
||||
"receivedAt": "入庫:{time}",
|
||||
"sourceRefs": "來源證據 {count} 筆",
|
||||
"blockerTitle": "目前阻塞",
|
||||
"sideEffectTitle": "副作用檢查",
|
||||
"sideEffectSeparator": ":",
|
||||
"emptyValue": "--",
|
||||
"values": {
|
||||
"yes": "有",
|
||||
"no": "無",
|
||||
"unknown": "未回報"
|
||||
},
|
||||
"sideEffects": {
|
||||
"incident_created": "建立 Incident",
|
||||
"telegram_sent": "送出 Telegram",
|
||||
"approval_created": "建立簽核",
|
||||
"runtime_route_changed": "變更 runtime route"
|
||||
},
|
||||
"blockers": {
|
||||
"gcloud_compute_instances_get_missing": "缺 GCP instance get 權限",
|
||||
"gcloud_compute_instances_list_missing": "缺 GCP instance list 權限",
|
||||
"gcloud_projects_get_iam_policy_missing": "缺 GCP IAM 讀取權限",
|
||||
"gcp_a_ssh_refused": "GCP-A SSH 拒絕連線",
|
||||
"gcp_a_ollama_11434_refused": "GCP-A Ollama 11434 拒絕",
|
||||
"proxy_110_11435_http_502": "110 proxy 11435 回 502",
|
||||
"unknown": "{blocker}"
|
||||
},
|
||||
"probes": {
|
||||
"gcp_a_ping": "GCP-A ping",
|
||||
"gcp_a_ssh_22": "GCP-A SSH 22",
|
||||
"gcp_a_direct_11434": "GCP-A 11434",
|
||||
"gcp_b_direct_11434": "GCP-B 11434",
|
||||
"proxy_110_11435": "110 proxy 11435",
|
||||
"proxy_110_11436": "110 proxy 11436",
|
||||
"proxy_110_11437": "110 proxy 11437",
|
||||
"unknown": "{probe}"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"incidentEvidence": {
|
||||
|
||||
@@ -445,6 +445,30 @@ interface AiRouteOperatorAction {
|
||||
reason?: string | null;
|
||||
}
|
||||
|
||||
/**
 * Shape of the `repair_evidence` object on the AI route status response.
 * Every field is optional or nullable; consumers must handle absent values.
 */
interface AiRouteRepairEvidence {
  provider?: string | null;
  stage?: string | null;
  provider_event_id?: string | null;
  conversation_event_id?: string | null;
  run_id?: string | null;
  alertname?: string | null;
  severity?: string | null;
  fingerprint?: string | null;
  target_resource?: string | null;
  // Free-form objects; keys are not fixed by this type.
  observed_state?: Record<string, unknown>;
  live_probe?: Record<string, unknown>;
  access_blockers?: string[];
  // Tri-state flags: true / false, or null / absent when not reported.
  side_effects?: {
    incident_created?: boolean | null;
    telegram_sent?: boolean | null;
    approval_created?: boolean | null;
    runtime_route_changed?: boolean | null;
  };
  source_ref_count?: number | null;
  // Timestamps arrive as strings; received_at is parsed with `new Date(...)` for display.
  provider_ts?: string | null;
  received_at?: string | null;
}
|
||||
|
||||
interface AiRouteStatusResponse {
|
||||
schema_version: string;
|
||||
workload_type: string;
|
||||
@@ -461,6 +485,7 @@ interface AiRouteStatusResponse {
|
||||
active_lane?: AiRouteLaneItem | null;
|
||||
skipped_lanes?: AiRouteLaneItem[];
|
||||
operator_action?: AiRouteOperatorAction | null;
|
||||
repair_evidence?: AiRouteRepairEvidence | null;
|
||||
checked_at: string;
|
||||
}
|
||||
|
||||
@@ -2050,6 +2075,49 @@ function aiRouteOperatorActionLabelKey(action?: string | null) {
|
||||
return "operatorActions.unknown";
|
||||
}
|
||||
|
||||
// Blocker codes that have a dedicated `repairEvidence.blockers.<code>` label;
// any other code is rendered through the `unknown` label.
const AI_ROUTE_REPAIR_BLOCKER_KEYS = new Set([
  "gcloud_compute_instances_get_missing",
  "gcloud_compute_instances_list_missing",
  "gcloud_projects_get_iam_policy_missing",
  "gcp_a_ssh_refused",
  "gcp_a_ollama_11434_refused",
  "proxy_110_11435_http_502",
]);

// Probe names that have a dedicated `repairEvidence.probes.<name>` label;
// any other name is rendered through the `unknown` label.
const AI_ROUTE_REPAIR_PROBE_KEYS = new Set([
  "gcp_a_ping",
  "gcp_a_ssh_22",
  "gcp_a_direct_11434",
  "gcp_b_direct_11434",
  "proxy_110_11435",
  "proxy_110_11436",
  "proxy_110_11437",
]);

// Side-effect flags in display order. `as const` keeps the literal key types
// so they can index `side_effects` directly.
const AI_ROUTE_REPAIR_SIDE_EFFECT_KEYS = [
  "incident_created",
  "telegram_sent",
  "approval_created",
  "runtime_route_changed",
] as const;
|
||||
|
||||
/**
 * Resolve the i18n key for a repair blocker code.
 * Known codes map to `repairEvidence.blockers.<code>`; empty, missing or
 * unrecognised codes map to `repairEvidence.blockers.unknown`.
 */
function aiRouteRepairBlockerLabelKey(blocker?: string | null) {
  if (!blocker || !AI_ROUTE_REPAIR_BLOCKER_KEYS.has(blocker)) {
    return "repairEvidence.blockers.unknown";
  }
  return `repairEvidence.blockers.${blocker}`;
}
|
||||
|
||||
/**
 * Resolve the i18n key for a live-probe name.
 * Known probes map to `repairEvidence.probes.<name>`; empty, missing or
 * unrecognised names map to `repairEvidence.probes.unknown`.
 */
function aiRouteRepairProbeLabelKey(probe?: string | null) {
  if (!probe || !AI_ROUTE_REPAIR_PROBE_KEYS.has(probe)) {
    return "repairEvidence.probes.unknown";
  }
  return `repairEvidence.probes.${probe}`;
}
|
||||
|
||||
/**
 * Shorten an identifier for display.
 * Missing or empty values render as "--". Values longer than 18 characters
 * are cut to their first 12 characters plus "...". Anything else is
 * returned unchanged.
 */
function compactEvidenceId(value?: string | null) {
  if (value === undefined || value === null || value === "") {
    return "--";
  }
  if (value.length <= 18) {
    return value;
  }
  const head = value.slice(0, 12);
  return `${head}...`;
}
|
||||
|
||||
function AiRouteStatusPanel({
|
||||
status,
|
||||
error,
|
||||
@@ -2064,6 +2132,9 @@ function AiRouteStatusPanel({
|
||||
const laneMode = status?.lane_mode ?? null;
|
||||
const laneModeKey = aiRouteLaneModeLabelKey(laneMode);
|
||||
const operatorActionKey = aiRouteOperatorActionLabelKey(status?.operator_action?.action);
|
||||
const repairEvidence = status?.repair_evidence ?? null;
|
||||
const repairBlockers = repairEvidence?.access_blockers?.filter(Boolean).slice(0, 4) ?? [];
|
||||
const repairProbes = Object.entries(repairEvidence?.live_probe ?? {}).slice(0, 6);
|
||||
const skippedLanes = status?.skipped_lanes ?? [];
|
||||
const skippedProviderSet = new Set(
|
||||
skippedLanes
|
||||
@@ -2076,6 +2147,12 @@ function AiRouteStatusPanel({
|
||||
minute: "2-digit",
|
||||
})
|
||||
: "--";
|
||||
const repairEvidenceAt = repairEvidence?.received_at
|
||||
? new Date(repairEvidence.received_at).toLocaleTimeString("zh-TW", {
|
||||
hour: "2-digit",
|
||||
minute: "2-digit",
|
||||
})
|
||||
: "--";
|
||||
|
||||
return (
|
||||
<section className="border border-[#e0ddd4] bg-white">
|
||||
@@ -2132,6 +2209,117 @@ function AiRouteStatusPanel({
|
||||
</div>
|
||||
)}
|
||||
|
||||
{repairEvidence && (
|
||||
<div className="border-b border-[#e0ddd4] bg-[#fbfcfb] px-4 py-4">
|
||||
<div className="flex flex-wrap items-start justify-between gap-3">
|
||||
<div className="flex min-w-0 items-center gap-2">
|
||||
<SearchCheck className="h-4 w-4 shrink-0 text-[#17602a]" aria-hidden="true" />
|
||||
<div className="min-w-0">
|
||||
<p className="text-sm font-semibold text-[#141413]">
|
||||
{t("repairEvidence.title")}
|
||||
</p>
|
||||
<p className="mt-1 truncate font-mono text-xs text-[#77736a]">
|
||||
{repairEvidence.provider_event_id ?? "--"}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<span className="border border-[#cbdccf] bg-[#f0faf2] px-2 py-0.5 text-xs font-semibold text-[#17602a]">
|
||||
{t("repairEvidence.sourceRefs", {
|
||||
count: repairEvidence.source_ref_count ?? 0,
|
||||
})}
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<div className="mt-4 grid gap-3 lg:grid-cols-3">
|
||||
<div className="border border-[#e0ddd4] bg-white px-3 py-3">
|
||||
<p className="text-xs font-semibold text-[#77736a]">
|
||||
{t("repairEvidence.meta")}
|
||||
</p>
|
||||
<div className="mt-2 space-y-1 text-xs leading-5 text-[#5f5b52]">
|
||||
<p>
|
||||
{t("repairEvidence.target", {
|
||||
target: repairEvidence.target_resource ?? "--",
|
||||
})}
|
||||
</p>
|
||||
<p>
|
||||
{t("repairEvidence.run", {
|
||||
run: compactEvidenceId(repairEvidence.run_id),
|
||||
})}
|
||||
</p>
|
||||
<p>
|
||||
{t("repairEvidence.receivedAt", {
|
||||
time: repairEvidenceAt,
|
||||
})}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="border border-[#e0ddd4] bg-white px-3 py-3">
|
||||
<p className="text-xs font-semibold text-[#77736a]">
|
||||
{t("repairEvidence.blockerTitle")}
|
||||
</p>
|
||||
<div className="mt-2 flex flex-wrap gap-1.5">
|
||||
{repairBlockers.length > 0 ? repairBlockers.map((blocker) => (
|
||||
<span
|
||||
key={blocker}
|
||||
className="border border-[#d9b36f] bg-[#fff7e8] px-2 py-1 text-xs font-medium text-[#6d4707]"
|
||||
title={blocker}
|
||||
>
|
||||
{AI_ROUTE_REPAIR_BLOCKER_KEYS.has(blocker)
|
||||
? t(aiRouteRepairBlockerLabelKey(blocker) as never)
|
||||
: t("repairEvidence.blockers.unknown", { blocker })}
|
||||
</span>
|
||||
)) : (
|
||||
<span className="text-xs text-[#77736a]">
|
||||
{t("repairEvidence.emptyValue")}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="border border-[#e0ddd4] bg-white px-3 py-3">
|
||||
<p className="text-xs font-semibold text-[#77736a]">
|
||||
{t("repairEvidence.sideEffectTitle")}
|
||||
</p>
|
||||
<div className="mt-2 space-y-1 text-xs leading-5 text-[#5f5b52]">
|
||||
{AI_ROUTE_REPAIR_SIDE_EFFECT_KEYS.map((key) => {
|
||||
const value = repairEvidence.side_effects?.[key];
|
||||
const valueKey = value === true
|
||||
? "repairEvidence.values.yes"
|
||||
: value === false
|
||||
? "repairEvidence.values.no"
|
||||
: "repairEvidence.values.unknown";
|
||||
return (
|
||||
<p key={key}>
|
||||
{t(`repairEvidence.sideEffects.${key}` as never)}
|
||||
{t("repairEvidence.sideEffectSeparator")}
|
||||
{t(valueKey as never)}
|
||||
</p>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{repairProbes.length > 0 && (
|
||||
<div className="mt-3 grid gap-px bg-[#e0ddd4] md:grid-cols-2 xl:grid-cols-3">
|
||||
{repairProbes.map(([probe, value]) => (
|
||||
<div key={probe} className="bg-white px-3 py-2 text-xs leading-5">
|
||||
<p className="font-semibold text-[#141413]">
|
||||
{AI_ROUTE_REPAIR_PROBE_KEYS.has(probe)
|
||||
? t(aiRouteRepairProbeLabelKey(probe) as never)
|
||||
: t("repairEvidence.probes.unknown", { probe })}
|
||||
</p>
|
||||
<p className="mt-1 truncate font-mono text-[#77736a]" title={String(value ?? "--")}>
|
||||
{String(value ?? "--")}
|
||||
</p>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="grid gap-px bg-[#e0ddd4] md:grid-cols-4">
|
||||
<div className="bg-white px-4 py-3">
|
||||
<p className="text-xs font-semibold text-[#77736a]">{t("fields.workload")}</p>
|
||||
|
||||
@@ -126,6 +126,11 @@ interface AiRouteStatusResponse {
|
||||
action?: string | null
|
||||
human_required?: boolean
|
||||
} | null
|
||||
repair_evidence?: {
|
||||
target_resource?: string | null
|
||||
access_blockers?: string[]
|
||||
source_ref_count?: number | null
|
||||
} | null
|
||||
}
|
||||
|
||||
interface EvidenceSnapshot {
|
||||
@@ -394,8 +399,17 @@ export function AutomationEvidenceCard() {
|
||||
skipped: skippedLanes || '--',
|
||||
})
|
||||
: null
|
||||
const repairEvidence = route?.repair_evidence ?? null
|
||||
const repairBlockers = repairEvidence?.access_blockers?.slice(0, 2).join(', ')
|
||||
const repairDetail = repairEvidence
|
||||
? t('routeRepairDetail', {
|
||||
target: repairEvidence.target_resource ?? '--',
|
||||
blockers: repairBlockers || '--',
|
||||
sourceRefs: repairEvidence.source_ref_count ?? 0,
|
||||
})
|
||||
: null
|
||||
const routeDetail = route?.route_reason && !route.route_error
|
||||
? `${routeSummary}${laneDetail ? t('routeReasonSeparator') + laneDetail : ''}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}`
|
||||
? `${routeSummary}${laneDetail ? t('routeReasonSeparator') + laneDetail : ''}${repairDetail ? t('routeReasonSeparator') + repairDetail : ''}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}`
|
||||
: routeSummary
|
||||
|
||||
return {
|
||||
|
||||
@@ -1,3 +1,46 @@
|
||||
## 2026-05-25|T177 AI route repair evidence API / 前端投影
|
||||
|
||||
**背景**:
|
||||
|
||||
- T176 已把 GCP-A primary lane down 的 live probe 與阻塞原因寫入 `awooop_conversation_event`,但 operator 在 AwoooP Runs / Dashboard 仍只能看到 `lane_mode=degraded_failover` 與 `repair_skipped_primary_lane`,無法直接看出「診斷證據在哪、阻塞是什麼、是否有副作用」。
|
||||
- 本輪目標不是改 AI Provider 路由,也不是自動重啟 GCP-A;只把既有 AwoooP DB 證據做白名單投影,讓前端可以解釋 GCP-A 為何被跳過。
|
||||
|
||||
**本次修復**:
|
||||
|
||||
- `/api/v1/platform/ai-route-status` 新增 `repair_evidence` 欄位;當 `lane_mode` 為 `degraded_failover` / `cloud_fallback` / `unavailable` 時,從最新 `ai_route_repair / repair_diagnosis` source envelope 取回:
|
||||
- `provider_event_id`、`conversation_event_id`、`run_id`
|
||||
- `target_resource`、`severity`、`fingerprint`
|
||||
- `live_probe`、`access_blockers`
|
||||
- `side_effects`(incident / Telegram / approval / runtime route 是否被建立或變更)
|
||||
- `source_ref_count`
|
||||
- AwoooP Runs 的 AI Provider panel 新增「最新修復診斷證據」區塊,顯示阻塞點、探測結果、來源證據數與副作用檢查。
|
||||
- Dashboard Automation Evidence card 也會把 route degraded 摘要補上 repair evidence target / blockers / source refs。
|
||||
- 前端文案已補 `zh-TW` / `en` i18n;UI 沒有新增 emoji,使用既有 Lucide icon。
|
||||
|
||||
**本地驗證**:
|
||||
|
||||
```text
|
||||
python3 -m py_compile apps/api/src/services/platform_operator_service.py apps/api/src/api/v1/platform/operator_runs.py -> pass
|
||||
jq empty apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass
|
||||
git diff --check -> pass
|
||||
pytest targeted ai-route status/evidence tests -> 6 passed
|
||||
pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-t177-tsconfig.tsbuildinfo -> pass
|
||||
```
|
||||
|
||||
**目前整體進度**:
|
||||
|
||||
- AwoooP 告警可觀測鏈:約 99.3%。
|
||||
- 低風險自動修復閉環:約 95.7%。
|
||||
- 前端 AI 自動化管理介面同步:約 97.3%。
|
||||
- Telegram 詳情 / 歷史可解釋性:約 95.5%。
|
||||
- Callback evidence / DB replayability:約 96.0%。
|
||||
- MCP / 自建 MCP 可見性:約 88%。
|
||||
- Sentry / SigNoz source correlation visibility:約 88%。
|
||||
- Ansible / PlayBook decision visibility:約 84.8%。
|
||||
- KM owner-review / completion governance:約 84%。
|
||||
- AI Provider lane 健康與可見性:約 91%(GCP-A runtime 尚未修復;但 repair diagnosis 已能在 API / 前端呈現)。
|
||||
- 完整 AI 自動化管理產品化:約 95.0%。
|
||||
|
||||
## 2026-05-25|T176 GCP-A primary lane repair evidence 入庫
|
||||
|
||||
**背景**:
|
||||
|
||||
Reference in New Issue
Block a user