diff --git a/apps/api/src/api/v1/approvals.py b/apps/api/src/api/v1/approvals.py index f118d794..687dd9ce 100644 --- a/apps/api/src/api/v1/approvals.py +++ b/apps/api/src/api/v1/approvals.py @@ -234,6 +234,7 @@ async def create_approval( title=f"新授權請求建立: {approval.action[:50]}...", risk_level=approval.risk_level.value, approval_id=str(approval.id), + incident_id=approval.incident_id, ) logger.info( @@ -326,6 +327,7 @@ async def sign_approval( actor_role="signer", risk_level=approval.risk_level.value, approval_id=str(approval_id), + incident_id=approval.incident_id, ) logger.info( @@ -354,6 +356,7 @@ async def sign_approval( actor="OpenClaw", actor_role="executor", approval_id=str(approval_id), + incident_id=approval.incident_id, ) execution_svc = get_execution_service() @@ -461,6 +464,7 @@ async def reject_approval( actor=request.rejector_name, actor_role="rejector", approval_id=str(approval_id), + incident_id=approval.incident_id, ) logger.info( @@ -615,6 +619,7 @@ async def bulk_approve( actor_role="signer", risk_level=signed_approval.risk_level.value, approval_id=approval_id_str, + incident_id=signed_approval.incident_id, ) # 如果觸發執行,加入背景任務 diff --git a/apps/api/src/api/v1/incidents.py b/apps/api/src/api/v1/incidents.py index bcfa11d3..7eac5bb9 100644 --- a/apps/api/src/api/v1/incidents.py +++ b/apps/api/src/api/v1/incidents.py @@ -30,6 +30,7 @@ from src.models.incident import Incident, IncidentStatus, Severity # Phase 16 R3.3b (2026-03-25 台北時區): Repository 層整合 - 已移至 Service 層 from src.services.decision_manager import get_decision_manager from src.services.incident_service import get_incident_service +from src.services.incident_timeline_service import fetch_incident_timeline from src.services.proposal_service import get_proposal_service from src.utils.timezone import now_taipei @@ -92,6 +93,48 @@ class ProposalGenerateResponse(BaseModel): incident_status: str | None = None +class IncidentTimelineEvent(BaseModel): + """事件處理歷程中的一筆原始或合成事件""" + stage: str + status: str + title: str + description: str | None = None + actor: str | None = None + timestamp: str | None = None + source_table: str | None = None + data: dict[str, Any] = Field(default_factory=dict) + + +class IncidentTimelineStage(BaseModel): + """事件處理歷程的標準階段""" + stage: str + label: str + status: str + timestamp: str | None = None + title: str + description: str | None = None + actor: str | None = None + source_table: str | None = None + data: dict[str, Any] = Field(default_factory=dict) + events: list[IncidentTimelineEvent] = Field(default_factory=list) + + +class IncidentTimelineResponse(BaseModel): + """事件完整處理歷程回應""" + incident_id: str + title: str + status: str + severity: str + started_at: str | None = None + updated_at: str | None = None + resolved_at: str | None = None + affected_services: list[str] = Field(default_factory=list) + approval_ids: list[str] = Field(default_factory=list) + timeline: list[IncidentTimelineStage] = Field(default_factory=list) + events: list[IncidentTimelineEvent] = Field(default_factory=list) + ascii_timeline: str + + # ============================================================================= # GET /api/v1/incidents # ============================================================================= @@ -271,6 +314,50 @@ async def get_incident(incident_id: str) -> IncidentResponse: ) from e +# ============================================================================= +# GET /api/v1/incidents/{incident_id}/timeline +# ============================================================================= + +@router.get( + "/{incident_id}/timeline", + response_model=IncidentTimelineResponse, + summary="取得事件完整處理歷程", + description="彙整 webhook、AI、目標、風險、安全閘、執行、驗證、KM 與結案事件。", +) +async def get_incident_timeline(incident_id: str) -> IncidentTimelineResponse: + """ + 取得單一 Incident 的端到端處理歷程。 + """ + try: + timeline = await fetch_incident_timeline(incident_id) + if timeline is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Incident not found: {incident_id}", + ) + + logger.info( + "incident_timeline_fetched", + incident_id=incident_id, + stage_count=len(timeline.get("timeline", [])), + event_count=len(timeline.get("events", [])), + ) + return IncidentTimelineResponse.model_validate(timeline) + + except HTTPException: + raise + except Exception as e: + logger.exception( + "get_incident_timeline_error", + incident_id=incident_id, + error=str(e), + ) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to get incident timeline: {str(e)}", + ) from e + + # ============================================================================= # POST /api/v1/incidents/{incident_id}/proposal # ============================================================================= diff --git a/apps/api/src/api/v1/timeline.py b/apps/api/src/api/v1/timeline.py index d8ae4e07..64591589 100644 --- a/apps/api/src/api/v1/timeline.py +++ b/apps/api/src/api/v1/timeline.py @@ -25,6 +25,7 @@ logger = get_logger("awoooi.timeline") ) async def get_timeline_events( limit: int = Query(default=100, ge=1, le=200, description="回傳筆數上限"), + incident_id: str | None = Query(default=None, description="只回傳特定 Incident 的事件"), ) -> dict: """ 取得時間軸事件 (後端授權來源) @@ -34,12 +35,13 @@ async def get_timeline_events( count: 事件總數 """ service = get_timeline_service() - events = await service.get_events(limit=limit) + events = await service.get_events(limit=limit, incident_id=incident_id) logger.info( "timeline_events_fetched", count=len(events), limit=limit, + incident_id=incident_id, ) return { diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py index 43f37727..5630b696 100644 --- a/apps/api/src/services/approval_db.py +++ b/apps/api/src/services/approval_db.py @@ -921,6 +921,7 @@ class TimelineDBService: actor_role: str | None = None, risk_level: str | None = None, approval_id: str | None = None, + incident_id: str | None = None, ) -> dict[str, Any]: """ 新增時間軸事件 @@ -935,6 +936,7 @@ class TimelineDBService: actor_role=actor_role, risk_level=risk_level, approval_id=approval_id, + incident_id=incident_id, ) db.add(event) await db.flush() @@ -945,6 +947,7 @@ class TimelineDBService: id=event.id, type=event_type, title=title, + incident_id=incident_id, ) return { @@ -952,19 +955,28 @@ class TimelineDBService: "type": event.event_type, "status": event.status, "title": event.title, + "incident_id": event.incident_id, "created_at": event.created_at.isoformat(), } - async def get_events(self, limit: int = 50) -> list[dict[str, Any]]: + async def get_events( + self, + limit: int = 50, + incident_id: str | None = None, + approval_ids: list[str] | None = None, + ) -> list[dict[str, Any]]: """ 取得最近的時間軸事件 """ async with get_db_context() as db: - result = await db.execute( - select(TimelineEvent) - .order_by(TimelineEvent.created_at.desc()) - .limit(limit) - ) + query = select(TimelineEvent) + if incident_id: + from sqlalchemy import or_ + filters = [TimelineEvent.incident_id == incident_id] + if approval_ids: + filters.append(TimelineEvent.approval_id.in_(approval_ids)) + query = query.where(or_(*filters)) + result = await db.execute(query.order_by(TimelineEvent.created_at.desc()).limit(limit)) events = result.scalars().all() return [ @@ -978,6 +990,7 @@ class TimelineDBService: "actor_role": e.actor_role, "risk_level": e.risk_level, "approval_id": e.approval_id, + "incident_id": e.incident_id, "created_at": e.created_at.isoformat(), } for e in events diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index 9f7b4801..f1213051 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -233,6 +233,7 @@ class ApprovalExecutionService: actor="leWOOOgo", actor_role="executor", approval_id=str(approval.id), + incident_id=approval.incident_id, ) # 執行結果 reply 原告警卡片 asyncio.create_task( @@ -269,6 +270,7 @@ class ApprovalExecutionService: actor="leWOOOgo", actor_role="executor", approval_id=str(approval.id), + incident_id=approval.incident_id, ) # Phase 6: 發送失敗通知 (fire-and-forget) @@ -346,6 +348,7 @@ class ApprovalExecutionService: actor="leWOOOgo", actor_role="executor", approval_id=str(approval.id), + incident_id=approval.incident_id, ) await asyncio.sleep(self.RETRY_DELAY_SECONDS) result = await executor.execute_with_audit( @@ -385,6 +388,7 @@ class ApprovalExecutionService: actor="leWOOOgo", actor_role="executor", approval_id=str(approval.id), + incident_id=approval.incident_id, ) # Phase 6: 發送成功通知 (fire-and-forget) @@ -506,6 +510,7 @@ class ApprovalExecutionService: actor="leWOOOgo", actor_role="executor", approval_id=str(approval.id), + incident_id=approval.incident_id, ) # Phase 6: 發送失敗通知 (fire-and-forget, 包含 Dry-Run 攔截) diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index d6d3bab5..ab97b2ca 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -398,6 +398,28 @@ async def create_incident_for_approval( error=str(_pg_err), ) + try: + from src.services.approval_db import get_timeline_service + + await get_timeline_service().add_event( + event_type="webhook", + status="success", + title=f"Webhook alert received: {alertname or alert_type}", + description=message, + actor=source, + actor_role="webhook", + risk_level=risk_level, + approval_id=approval_id, + incident_id=incident.incident_id, + ) + except Exception as _timeline_err: + logger.warning( + "incident_timeline_webhook_event_failed", + incident_id=incident.incident_id, + approval_id=approval_id, + error=str(_timeline_err), + ) + logger.info( "incident_created_for_approval", incident_id=incident.incident_id, @@ -897,6 +919,30 @@ class IncidentService: persisted_to_pg=incident.persisted_to_pg, ) + try: + from src.services.approval_db import get_timeline_service + + await get_timeline_service().add_event( + event_type="webhook", + status="success" if redis_success or pg_success else "warning", + title=f"Signal received: {signal.alert_name}", + description=( + signal.annotations.get("message") + or signal.annotations.get("description") + or signal.annotations.get("summary") + ), + actor=signal.source, + actor_role="webhook", + risk_level=incident.severity.value, + incident_id=incident.incident_id, + ) + except Exception as timeline_error: + logger.warning( + "incident_timeline_signal_event_failed", + incident_id=incident.incident_id, + error=str(timeline_error), + ) + return incident except Exception as e: diff --git a/apps/api/src/services/incident_timeline_service.py b/apps/api/src/services/incident_timeline_service.py new file mode 100644 index 00000000..0236b8e5 --- /dev/null +++ b/apps/api/src/services/incident_timeline_service.py @@ -0,0 +1,570 @@ +"""Incident processing timeline aggregation. + +Builds the operator-facing "what happened" timeline from the existing event +tables without adding another schema hop. The raw `timeline_events` table is +still the append-only audit rail; this service composes it with Incident, +Approval, Evidence, Executor, and KM records so a single Incident detail view can +show the full path. +""" + +from __future__ import annotations + +from datetime import datetime +from typing import Any + +import structlog +from sqlalchemy import or_, select + +from src.db.base import get_db_context +from src.db.models import ( + AlertOperationLog, + ApprovalRecord, + AutoRepairExecution, + IncidentEvidence, + IncidentRecord, + KnowledgeEntryRecord, + TimelineEvent, +) + +logger = structlog.get_logger(__name__) + + +STAGE_DEFS: tuple[tuple[str, str], ...] = ( + ("webhook", "Webhook"), + ("investigator", "Investigator"), + ("ai_router", "AI Router"), + ("llm", "LLM"), + ("target", "Target"), + ("blast", "Blast Radius"), + ("safe", "Safety Gate"), + ("executor", "Executor"), + ("verifier", "Verifier"), + ("km", "KM"), + ("close", "Closure"), +) + +_STAGE_LABEL = dict(STAGE_DEFS) +_STATUS_RANK = { + "skipped": 0, + "pending": 1, + "info": 2, + "completed": 3, + "success": 4, + "warning": 5, + "error": 6, +} +_EVENT_STAGE_MAP = { + "webhook": "webhook", + "alert": "webhook", + "system": "safe", + "agent": "llm", + "ai_router": "ai_router", + "llm": "llm", + "mcp_call": "investigator", + "investigator": "investigator", + "target": "target", + "blast": "blast", + "security": "safe", + "safe": "safe", + "human": "safe", + "exec": "executor", + "executor": "executor", + "verify": "verifier", + "verifier": "verifier", + "km": "km", + "learn": "km", + "close": "close", + "resolved": "close", +} + + +def _value(value: Any) -> Any: + return value.value if hasattr(value, "value") else value + + +def _iso(value: Any) -> str | None: + if isinstance(value, datetime): + return value.isoformat() + return None + + +def _compact(value: str | None, max_len: int = 500) -> str | None: + if not value: + return value + return value if len(value) <= max_len else f"{value[:max_len - 3]}..." + + +def _event( + *, + stage: str, + status: str, + title: str, + timestamp: Any = None, + description: str | None = None, + actor: str | None = None, + source_table: str, + data: dict[str, Any] | None = None, +) -> dict[str, Any]: + return { + "stage": stage, + "status": status, + "title": title, + "description": _compact(description), + "actor": actor, + "timestamp": _iso(timestamp), + "source_table": source_table, + "data": data or {}, + } + + +def _empty_stage(stage: str, label: str) -> dict[str, Any]: + return { + "stage": stage, + "label": label, + "status": "skipped", + "timestamp": None, + "title": f"{label} not recorded", + "description": None, + "actor": None, + "source_table": None, + "data": {}, + "events": [], + } + + +def _apply_event(stages: dict[str, dict[str, Any]], event: dict[str, Any]) -> None: + stage_name = event["stage"] + stage = stages.get(stage_name) + if stage is None: + return + + stage["events"].append(event) + current_rank = _STATUS_RANK.get(stage["status"], 0) + incoming_rank = _STATUS_RANK.get(event["status"], 0) + if incoming_rank >= current_rank: + stage.update({ + "status": event["status"], + "timestamp": event["timestamp"] or stage["timestamp"], + "title": event["title"], + "description": event["description"], + "actor": event["actor"], + "source_table": event["source_table"], + "data": event["data"], + }) + elif stage["timestamp"] is None and event["timestamp"]: + stage["timestamp"] = event["timestamp"] + + +def _stage_from_event_type(event_type: str | None) -> str: + return _EVENT_STAGE_MAP.get((event_type or "").lower(), "safe") + + +def format_ascii_timeline(stages: list[dict[str, Any]]) -> str: + """Compact ASCII line for Telegram and logs.""" + marks = { + "success": "ok", + "completed": "ok", + "info": "ok", + "warning": "warn", + "error": "fail", + "pending": "wait", + "skipped": "skip", + } + parts = [ + f"{stage['stage']}:{marks.get(stage['status'], stage['status'])}" + for stage in stages + if stage["status"] != "skipped" + ] + return " > ".join(parts) if parts else "webhook:skip > ai:skip > executor:skip" + + +async def fetch_incident_timeline(incident_id: str) -> dict[str, Any] | None: + """Return a complete detail timeline for one incident.""" + stages = {stage: _empty_stage(stage, label) for stage, label in STAGE_DEFS} + + async with get_db_context() as db: + incident = ( + await db.execute( + select(IncidentRecord).where(IncidentRecord.incident_id == incident_id) + ) + ).scalar_one_or_none() + if incident is None: + return None + + approvals = ( + await db.execute( + select(ApprovalRecord) + .where(ApprovalRecord.incident_id == incident_id) + .order_by(ApprovalRecord.created_at.asc()) + ) + ).scalars().all() + approval_ids = [str(a.id) for a in approvals] + + evidence_records = ( + await db.execute( + select(IncidentEvidence) + .where(IncidentEvidence.incident_id == incident_id) + .order_by(IncidentEvidence.collected_at.asc()) + ) + ).scalars().all() + + executions = ( + await db.execute( + select(AutoRepairExecution) + .where(AutoRepairExecution.incident_id == incident_id) + .order_by(AutoRepairExecution.created_at.asc()) + ) + ).scalars().all() + + km_entries = ( + await db.execute( + select(KnowledgeEntryRecord) + .where(KnowledgeEntryRecord.related_incident_id == incident_id) + .order_by(KnowledgeEntryRecord.created_at.asc()) + ) + ).scalars().all() + + timeline_filter = TimelineEvent.incident_id == incident_id + if approval_ids: + timeline_filter = or_(timeline_filter, TimelineEvent.approval_id.in_(approval_ids)) + raw_timeline = ( + await db.execute( + select(TimelineEvent) + .where(timeline_filter) + .order_by(TimelineEvent.created_at.asc()) + ) + ).scalars().all() + + aol_filter = AlertOperationLog.incident_id == incident_id + if approval_ids: + aol_filter = or_(aol_filter, AlertOperationLog.approval_id.in_(approval_ids)) + alert_ops = ( + await db.execute( + select(AlertOperationLog) + .where(aol_filter) + .order_by(AlertOperationLog.created_at.asc()) + .limit(100) + ) + ).scalars().all() + + events: list[dict[str, Any]] = [] + + alert_name = incident.alertname + if not alert_name and incident.signals: + first_signal = incident.signals[0] if isinstance(incident.signals, list) else {} + alert_name = first_signal.get("alert_name") or first_signal.get("labels", {}).get("alertname") + + events.append(_event( + stage="webhook", + status="completed", + title=f"Alert received: {alert_name or 'unknown'}", + timestamp=incident.created_at, + description=f"source={_signal_source(incident.signals)} severity={_value(incident.severity)}", + actor=_signal_source(incident.signals) or "alertmanager", + source_table="incidents", + data={ + "alertname": alert_name, + "severity": _value(incident.severity), + "signals": incident.signals or [], + "affected_services": incident.affected_services or [], + }, + )) + + for evidence in evidence_records: + status = "completed" if (evidence.sensors_succeeded or 0) > 0 else "warning" + events.append(_event( + stage="investigator", + status=status, + title="Evidence snapshot collected", + timestamp=evidence.collected_at, + description=evidence.evidence_summary, + actor="pre_decision_investigator", + source_table="incident_evidence", + data={ + "sensors_attempted": evidence.sensors_attempted, + "sensors_succeeded": evidence.sensors_succeeded, + "duration_ms": evidence.collection_duration_ms, + "mcp_health": evidence.mcp_health, + }, + )) + if evidence.verification_result: + verification_status = ( + "success" if evidence.verification_result == "success" + else "warning" if evidence.verification_result == "degraded" + else "error" + ) + events.append(_event( + stage="verifier", + status=verification_status, + title=f"Post-execution verification: {evidence.verification_result}", + timestamp=evidence.collected_at, + description=evidence.self_healing_detail and str(evidence.self_healing_detail), + actor="post_execution_verifier", + source_table="incident_evidence", + data={ + "verification_result": evidence.verification_result, + "self_healing_score": evidence.self_healing_score, + "self_healing_detail": evidence.self_healing_detail, + }, + )) + + for approval in approvals: + metadata = approval.extra_metadata or {} + provider = metadata.get("source") or _provider_from_description(approval.description) + if provider: + events.append(_event( + stage="ai_router", + status="completed", + title=f"AI route selected: {provider}", + timestamp=approval.created_at, + description=approval.description, + actor="ai_router", + source_table="approval_records", + data={ + "provider": provider, + "confidence_score": metadata.get("confidence_score"), + "is_rule_based": metadata.get("is_rule_based"), + }, + )) + events.append(_event( + stage="llm", + status="completed", + title=f"LLM proposal generated: {provider}", + timestamp=approval.created_at, + description=approval.description, + actor=provider, + source_table="approval_records", + data={ + "approval_id": approval.id, + "matched_playbook_id": approval.matched_playbook_id, + "playbook_id": metadata.get("playbook_id"), + }, + )) + + events.append(_event( + stage="target", + status="completed", + title="Target resource selected", + timestamp=approval.created_at, + description=approval.action, + actor=approval.requested_by, + source_table="approval_records", + data={"action": approval.action}, + )) + + events.append(_event( + stage="blast", + status="completed" if approval.blast_radius else "warning", + title="Blast radius evaluated", + timestamp=approval.created_at, + description=None, + actor=approval.requested_by, + source_table="approval_records", + data=approval.blast_radius or {}, + )) + + events.append(_event( + stage="safe", + status=_approval_status_to_timeline_status(approval.status), + title=f"Safety gate: {_value(approval.risk_level)} / {_value(approval.status)}", + timestamp=approval.created_at, + description=_dry_run_summary(approval.dry_run_checks), + actor=approval.requested_by, + source_table="approval_records", + data={ + "approval_id": approval.id, + "risk_level": _value(approval.risk_level), + "status": _value(approval.status), + "required_signatures": approval.required_signatures, + "current_signatures": approval.current_signatures, + "dry_run_checks": approval.dry_run_checks or [], + }, + )) + + if str(_value(approval.status)).startswith("execution_"): + success = _value(approval.status) == "execution_success" + events.append(_event( + stage="executor", + status="success" if success else "error", + title="Approval execution completed", + timestamp=approval.resolved_at or approval.updated_at, + description=approval.rejection_reason, + actor="approval_execution", + source_table="approval_records", + data={ + "approval_id": approval.id, + "status": _value(approval.status), + }, + )) + + for execution in executions: + events.append(_event( + stage="executor", + status="success" if execution.success else "error", + title=f"Auto repair execution: {execution.playbook_name}", + timestamp=execution.created_at, + description=execution.error_message, + actor=execution.triggered_by, + source_table="auto_repair_executions", + data={ + "playbook_id": execution.playbook_id, + "success": execution.success, + "execution_time_ms": execution.execution_time_ms, + "similarity_score": execution.similarity_score, + "risk_level": execution.risk_level, + "executed_steps": execution.executed_steps, + }, + )) + + for entry in km_entries: + events.append(_event( + stage="km", + status="completed", + title=f"KM entry written: {entry.title}", + timestamp=entry.created_at, + description=entry.content, + actor=entry.created_by or _value(entry.source), + source_table="knowledge_entries", + data={ + "knowledge_id": entry.id, + "entry_type": _value(entry.entry_type), + "status": _value(entry.status), + "path_type": entry.path_type, + "related_approval_id": entry.related_approval_id, + }, + )) + + if incident.resolved_at or incident.closed_at: + events.append(_event( + stage="close", + status="success", + title=f"Incident {_value(incident.status)}", + timestamp=incident.closed_at or incident.resolved_at, + description=None, + actor="incident_service", + source_table="incidents", + data={ + "status": _value(incident.status), + "outcome": incident.outcome, + "resolved_at": _iso(incident.resolved_at), + "closed_at": _iso(incident.closed_at), + }, + )) + + for raw in raw_timeline: + events.append(_event( + stage=_stage_from_event_type(raw.event_type), + status=raw.status, + title=raw.title, + timestamp=raw.created_at, + description=raw.description, + actor=raw.actor, + source_table="timeline_events", + data={ + "timeline_event_id": raw.id, + "event_type": raw.event_type, + "approval_id": raw.approval_id, + "actor_role": raw.actor_role, + "risk_level": raw.risk_level, + }, + )) + + for op in alert_ops: + events.append(_event( + stage=_stage_from_aol(op.event_type), + status="error" if op.success is False else "success" if op.success is True else "info", + title=f"AOL: {_value(op.event_type)}", + timestamp=op.created_at, + description=op.action_detail or op.error_message, + actor=op.actor, + source_table="alert_operation_log", + data={ + "operation_id": op.id, + "event_type": _value(op.event_type), + "approval_id": op.approval_id, + "context": op.context, + }, + )) + + events.sort(key=lambda e: e["timestamp"] or "") + for event in events: + _apply_event(stages, event) + + stage_list = [stages[stage] for stage, _ in STAGE_DEFS] + result = { + "incident_id": incident.incident_id, + "title": alert_name or incident.incident_id, + "status": _value(incident.status), + "severity": _value(incident.severity), + "started_at": _iso(incident.created_at), + "updated_at": _iso(incident.updated_at), + "resolved_at": _iso(incident.resolved_at), + "affected_services": incident.affected_services or [], + "approval_ids": approval_ids, + "timeline": stage_list, + "events": events, + "ascii_timeline": format_ascii_timeline(stage_list), + } + logger.info( + "incident_timeline_fetched", + incident_id=incident_id, + stages_recorded=sum(1 for stage in stage_list if stage["status"] != "skipped"), + event_count=len(events), + ) + return result + + +def _signal_source(signals: Any) -> str | None: + if not signals or not isinstance(signals, list): + return None + first_signal = signals[0] if signals else {} + if not isinstance(first_signal, dict): + return None + return first_signal.get("source") + + +def _provider_from_description(description: str | None) -> str | None: + if not description: + return None + if description.startswith("[AI:"): + return description.split("]", 1)[0].replace("[AI:", "").strip() + return None + + +def _approval_status_to_timeline_status(status: Any) -> str: + value = str(_value(status)) + if value in {"rejected", "expired"}: + return "error" + if value in {"approved", "execution_success"}: + return "success" + if value == "execution_failed": + return "warning" + return "info" + + +def _dry_run_summary(checks: Any) -> str | None: + if not checks: + return None + passed = 0 + total = 0 + for check in checks: + if isinstance(check, dict): + total += 1 + if check.get("passed"): + passed += 1 + return f"Dry-run checks: {passed}/{total} passed" if total else None + + +def _stage_from_aol(event_type: Any) -> str: + value = str(_value(event_type)).upper() + if value == "ALERT_RECEIVED": + return "webhook" + if value in {"PRE_FLIGHT_PASSED", "PRE_FLIGHT_FAILED", "GUARDRAIL_BLOCKED"}: + return "safe" + if value in {"EXECUTION_STARTED", "EXECUTION_COMPLETED", "AUTO_REPAIR_TRIGGERED", "CHANGE_APPLIED"}: + return "executor" + if value in {"TELEGRAM_SENT", "TELEGRAM_RESULT_SENT", "USER_ACTION", "APPROVAL_ESCALATED"}: + return "safe" + if value == "RESOLVED": + return "close" + return "safe" diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index dc815c42..4771b3a1 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -374,6 +374,7 @@ class TelegramMessage: f"{self.status_emoji} ACTION REQUIRED | {html.escape(risk_zh)}\n" f"──────────────────────\n" f"📋 {html.escape(incident_id)}\n" + f"流程:webhook>investigator>ai>safe>executor>verifier>km\n" f"🎯 資源:{safe_resource}\n" f"{category_line}" f"\n" @@ -4593,6 +4594,7 @@ class TelegramGateway: """ # 延遲 import 避免循環依賴 (與 approval_service 同一模式) from src.repositories.incident_repository import get_incident_repository + from src.services.incident_timeline_service import fetch_incident_timeline try: repo = get_incident_repository() @@ -4606,8 +4608,8 @@ class TelegramGateway: confidence_bar = "█" * int((dc.confidence if dc else 0) * 10) + "░" * (10 - int((dc.confidence if dc else 0) * 10)) lines = [ - f"📋 事件詳情", - f"", + "📋 事件詳情", + "", f"🔖 ID: {html.escape(incident.incident_id)}", f"📊 狀態: {incident.status.value}", f"⚡ 嚴重度: {incident.severity.value}", @@ -4618,7 +4620,7 @@ class TelegramGateway: if dc: lines += [ - f"", + "", f"🤖 AI 分析 ({html.escape(dc.model_used)})", f"💡 {html.escape(dc.hypothesis)}", f"📈 信心: [{confidence_bar}] {dc.confidence:.0%}", @@ -4630,7 +4632,7 @@ class TelegramGateway: from zoneinfo import ZoneInfo created_taipei = incident.created_at.astimezone(ZoneInfo("Asia/Taipei")) if incident.created_at else incident.created_at lines += [ - f"", + "", f"🕐 建立: {created_taipei.strftime('%m/%d %H:%M') if created_taipei else 'N/A'}", ] @@ -4638,6 +4640,14 @@ class TelegramGateway: fs = incident.frequency_stats lines.append(f"📉 頻率: 1h={fs.count_1h} 24h={fs.count_24h} 7d={fs.count_7d}") + timeline = await fetch_incident_timeline(incident_id) + if timeline and timeline.get("ascii_timeline"): + lines += [ + "", + "🧭 處理歷程", + f"{html.escape(timeline['ascii_timeline'])}", + ] + await self.send_notification("\n".join(lines)) except Exception as e: diff --git a/apps/api/tests/test_incident_timeline_service.py b/apps/api/tests/test_incident_timeline_service.py new file mode 100644 index 00000000..496e8041 --- /dev/null +++ b/apps/api/tests/test_incident_timeline_service.py @@ -0,0 +1,25 @@ +from src.services.incident_timeline_service import STAGE_DEFS, format_ascii_timeline + + +def _stages(status_by_stage: dict[str, str]) -> list[dict]: + return [ + {"stage": stage, "status": status_by_stage.get(stage, "skipped")} + for stage, _label in STAGE_DEFS + ] + + +def test_format_ascii_timeline_skips_unrecorded_stages() -> None: + stages = _stages({ + "webhook": "completed", + "ai_router": "success", + "executor": "error", + "km": "pending", + }) + + assert format_ascii_timeline(stages) == ( + "webhook:ok > ai_router:ok > executor:fail > km:wait" + ) + + +def test_format_ascii_timeline_has_empty_fallback() -> None: + assert format_ascii_timeline(_stages({})) == "webhook:skip > ai:skip > executor:skip" diff --git a/apps/web/src/components/incident/incident-card.tsx b/apps/web/src/components/incident/incident-card.tsx index e3914ab6..297b1f95 100644 --- a/apps/web/src/components/incident/incident-card.tsx +++ b/apps/web/src/components/incident/incident-card.tsx @@ -15,7 +15,7 @@ import React, { useState, useCallback, useRef, useEffect } from 'react' import { useTranslations } from 'next-intl' -import type { IncidentResponse, DecisionInfo } from '@/lib/api-client' +import type { IncidentResponse, DecisionInfo, IncidentTimelineResponse } from '@/lib/api-client' import { apiClient } from '@/lib/api-client' import { CURRENT_USER } from '@/lib/constants' import { useCSRF } from '@/hooks/useCSRF' @@ -73,6 +73,34 @@ function formatDuration(createdAt: string | undefined): string { } } +function statusColor(status: string): string { + switch (status) { + case 'success': + case 'completed': + return '#16a34a' + case 'warning': + return '#d97000' + case 'error': + return '#cc2200' + case 'pending': + return '#87867f' + default: + return '#4A90D9' + } +} + +function formatTimelineTime(value: string | null): string { + if (!value) return '--' + try { + return new Date(value).toLocaleTimeString('zh-TW', { + hour: '2-digit', + minute: '2-digit', + }) + } catch { + return '--' + } +} + // ============================================================================= // 2026-04-02 Claude Code: Phase R-UI2 handleApprove/Reject 重複邏輯抽取 // useApprovalAction — 統一 setup/teardown:loading 狀態、timeout、error 處理 @@ -139,6 +167,10 @@ export function IncidentCard({ incident, decision, onApprovalChange }: IncidentC const [currentProposalId, setCurrentProposalId] = useState(null) const [aiExpanded, setAiExpanded] = useState(false) + const [timelineExpanded, setTimelineExpanded] = useState(false) + const [timelineData, setTimelineData] = useState(null) + const [timelineLoading, setTimelineLoading] = useState(false) + const [timelineError, setTimelineError] = useState(null) const incidentStatus = incident.status as string const sev = incident.severity as keyof typeof SEV_CONFIG @@ -154,6 +186,13 @@ export function IncidentCard({ incident, decision, onApprovalChange }: IncidentC const decisionAction = decision?.proposal_data?.action ?? '' const decisionReasoning = decision?.proposal_data?.reasoning ?? '' + useEffect(() => { + setTimelineExpanded(false) + setTimelineData(null) + setTimelineError(null) + setTimelineLoading(false) + }, [incident.incident_id]) + // ── 解析 proposalId(approve/reject 共用前置步驟)───────────────────────── const resolveProposalId = useCallback(async (): Promise => { let approvalId = currentProposalId @@ -208,6 +247,23 @@ export function IncidentCard({ incident, decision, onApprovalChange }: IncidentC const handleApprove = approveHook.execute const handleReject = rejectHook.execute + const handleTimelineToggle = useCallback(async () => { + const nextExpanded = !timelineExpanded + setTimelineExpanded(nextExpanded) + if (!nextExpanded || timelineData || timelineLoading) return + + setTimelineLoading(true) + setTimelineError(null) + try { + const timeline = await apiClient.getIncidentTimeline(incident.incident_id) + setTimelineData(timeline) + } catch (error) { + setTimelineError(error instanceof Error ? error.message : String(error)) + } finally { + setTimelineLoading(false) + } + }, [incident.incident_id, timelineData, timelineExpanded, timelineLoading]) + // ── 授權按鈕渲染 ─────────────────────────────────────────────────────────── const renderApproveButtons = () => { switch (buttonState) { @@ -349,6 +405,108 @@ export function IncidentCard({ incident, decision, onApprovalChange }: IncidentC + + {timelineExpanded && ( +
+ {timelineLoading && ( +
載入處理歷程...
+ )} + {timelineError && ( +
{timelineError}
+ )} + {!timelineLoading && !timelineError && timelineData && ( + <> +
+ {timelineData.ascii_timeline} +
+
+ {timelineData.timeline + .filter(stage => stage.status !== 'skipped') + .map(stage => ( +
+ + {stage.stage} + + + {stage.title} + + + {formatTimelineTime(stage.timestamp)} + +
+ ))} +
+ + )} +
+ )} + {/* AI 提案行(可展開)*/} {decisionAction && ( <> diff --git a/apps/web/src/lib/api-client.ts b/apps/web/src/lib/api-client.ts index 231a61d4..8d670426 100644 --- a/apps/web/src/lib/api-client.ts +++ b/apps/web/src/lib/api-client.ts @@ -175,6 +175,11 @@ export const apiClient = { return handleResponse(res) }, + async getIncidentTimeline(incidentId: string) { + const res = await fetch(`${API_BASE_URL}/incidents/${incidentId}/timeline`) + return handleResponse(res) + }, + async generateProposal(incidentId: string) { const res = await fetch(`${API_BASE_URL}/incidents/${incidentId}/proposal`, { method: 'POST', @@ -279,6 +284,37 @@ export interface IncidentListResponse { incidents: IncidentResponse[] } +export interface IncidentTimelineEvent { + stage: string + status: string + title: string + description: string | null + actor: string | null + timestamp: string | null + source_table: string | null + data: Record +} + +export interface IncidentTimelineStage extends IncidentTimelineEvent { + label: string + events: IncidentTimelineEvent[] +} + +export interface IncidentTimelineResponse { + incident_id: string + title: string + status: string + severity: string + started_at: string | null + updated_at: string | null + resolved_at: string | null + affected_services: string[] + approval_ids: string[] + timeline: IncidentTimelineStage[] + events: IncidentTimelineEvent[] + ascii_timeline: string +} + export interface BlastRadius { affected_pods: number estimated_downtime: string diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 7598a2c7..a7eec6bc 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,21 @@ --- +## 2026-04-29 | Wave B 事件處理歷程透明化 + +Codex 接續 AI 自動化 Wave B,先把「告警→AI→安全閘→執行→驗證→KM」處理鏈變成可查、可顯示、可發 Telegram 的事件 timeline。 + +### 完成 +- 新增 Incident timeline 聚合 service,從 incidents、approvals、EvidenceSnapshot、AutoRepairExecution、timeline_events、AOL、KM entries 組成 11 階段處理歷程。 +- 新增 `GET /api/v1/incidents/{incident_id}/timeline`,並讓 `timeline_events` 支援 `incident_id` 關聯查詢。 +- Incident 建立、Approval 簽核、executor 狀態寫入時補 incident 關聯,讓後續事件能回掛單一 incident。 +- WarRoom IncidentCard 增加「處理歷程」展開區,Telegram 處置卡與事件詳情補 ASCII timeline。 + +### 驗證 +- `py_compile` timeline/API/service/Telegram 相關檔案通過。 +- `pytest tests/test_incident_timeline_service.py tests/test_action_parser_safety.py -q` 通過。 +- `pnpm --filter @awoooi/web typecheck` 通過。 + ## 🔴 2026-04-29 | LLM 飛輪復活戰 — 推翻 A2 + CD blocker 連環解 統帥訊息:「2 個月在原地打轉」「Claude Code 浪費我兩個月訂閱費」+「主要優先用 111 主機的 Ollama」。